library(rpart)
library(mclust)
## Package 'mclust' version 6.0.0
## Type 'citation("mclust")' for citing this R package in publications.
library(parallel)
library(e1071)
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
# Load the ZIP digits data: one row per image; column 1 ("digit") is the
# label and p1..p256 are pixel intensities in [-1, 1].
zip <- read.csv("/course/data/2024-yong/lab07/zip.csv")
dim(zip)
# Keep only digits 6 and 8 for the two-class problem
# (%in% replaces the equivalent but noisier `digit == 6 | digit == 8`)
data <- zip[zip$digit %in% c(6, 8), ]
str(data)
## 'data.frame': 1542 obs. of 257 variables:
## $ digit: int 6 8 8 6 6 6 6 6 8 6 ...
## $ p1 : num -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
## $ p2 : num -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
## $ p3 : num -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
## $ p4 : num -1 -1 -1 -1 -1 -1 -1 -1 -0.599 -1 ...
## $ p5 : num -1 -1 -0.837 -1 -1 -1 -1 -1 0.83 -1 ...
## $ p6 : num -1 -1 -0.303 -1 -1 -1 -1 -1 1 -1 ...
## $ p7 : num -1 -1 0.48 -1 -1 -1 -1 -1 1 -1 ...
## $ p8 : num -1 -0.936 0.764 -0.458 -0.048 -0.996 -1 -1 0.904 -0.835 ...
## $ p9 : num -0.72 -0.123 0.611 0.903 0.989 0.275 -0.192 -0.832 -0.542 0.858 ...
## $ p10 : num 0.525 0.861 0.394 0.964 -0.282 -0.165 0.151 0.782 -1 -0.543 ...
## $ p11 : num -0.392 -0.066 -0.361 -0.407 -1 -1 -1 0.189 -1 -1 ...
## $ p12 : num -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
## $ p13 : num -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
## $ p14 : num -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
## $ p15 : num -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
## $ p16 : num -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
## $ p17 : num -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
## $ p18 : num -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
## $ p19 : num -1 -1 -1 -1 -1 -1 -1 -1 -0.533 -1 ...
## $ p20 : num -1 -1 -0.553 -1 -1 -1 -1 -1 0.852 -1 ...
## $ p21 : num -1 -0.941 0.998 -1 -1 -1 -1 -1 1 -1 ...
## $ p22 : num -1 0.374 1 -1 -1 -1 -1 -1 1 -1 ...
## $ p23 : num -1 0.647 1 -0.104 -0.645 -1 -1 -1 1 -1 ...
## $ p24 : num -0.961 0.823 1 0.999 0.997 -0.444 -0.896 -0.868 1 -0.279 ...
## $ p25 : num 0.716 0.941 1 1 0.996 0.835 0.84 0.518 0.921 1 ...
## $ p26 : num 0.298 0.506 1 0.994 -0.511 -0.667 -0.253 1 0.818 -0.527 ...
## $ p27 : num -0.979 1 1 -0.296 -1 -1 -1 0.581 0.818 -1 ...
## $ p28 : num -1 -0.637 0.055 -1 -1 -1 -1 -1 0.152 -1 ...
## $ p29 : num -1 -1 -1 -1 -1 -1 -1 -1 0.091 -1 ...
## $ p30 : num -1 -1 -1 -1 -1 -1 -1 -1 -0.608 -1 ...
## $ p31 : num -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
## $ p32 : num -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
## $ p33 : num -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
## $ p34 : num -1 -1 -1 -1 -1 -1 -1 -1 -0.98 -1 ...
## $ p35 : num -1 -1 -0.8 -1 -1 -1 -1 -1 0.461 -1 ...
## $ p36 : num -1 -1 0.834 -1 -1 -1 -1 -1 1 -1 ...
## $ p37 : num -1 -0.303 1 -0.961 -1 -1 -1 -1 0.836 -1 ...
## $ p38 : num -1 1 1 0.37 -1 -1 -1 -1 -0.485 -1 ...
## $ p39 : num -1 0.498 0.3 0.979 0.216 -1 -1 -1 0.202 -0.795 ...
## $ p40 : num 0.045 0.414 -0.338 0.936 1 0.579 -0.338 0.115 1 0.757 ...
## $ p41 : num 0.569 0.919 -0.744 0.05 0.352 0.187 0.912 1 1 0.774 ...
## $ p42 : num -0.903 -0.018 -0.418 -0.79 -0.996 -1 -0.883 0.767 1 -0.981 ...
## $ p43 : num -1 1 0.305 -1 -1 -1 -1 -0.597 1 -1 ...
## $ p44 : num -1 -0.383 -0.118 -1 -1 -1 -1 -1 1 -1 ...
## $ p45 : num -1 -1 0.267 -1 -1 -1 -1 -1 1 -1 ...
## $ p46 : num -1 -1 -0.164 -1 -1 -1 -1 -1 0.806 -1 ...
## $ p47 : num -1 -1 -1 -1 -1 -1 -1 -1 -0.751 -1 ...
## $ p48 : num -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
## $ p49 : num -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
## $ p50 : num -1 -1 -1 -1 -1 -1 -1 -1 -0.754 -1 ...
## $ p51 : num -1 -1 0.375 -1 -1 -1 -1 -1 1 -1 ...
## $ p52 : num -1 -1 1 -1 -1 -1 -1 -1 1 -1 ...
## $ p53 : num -1 -0.558 0.817 0.311 -1 -1 -1 -1 0.043 -1 ...
## $ p54 : num -1 1 -0.454 1 -0.399 -1 -1 -1 -1 -1 ...
## $ p55 : num -0.568 -0.148 -0.987 0.935 0.97 -0.264 -1 -0.664 -0.773 0.114 ...
## $ p56 : num 0.961 -1 -1 -0.408 0.94 0.986 0.589 0.976 0.508 0.998 ...
## $ p57 : num -0.55 -0.601 -1 -1 -0.714 -0.725 0.402 0.695 1 -0.276 ...
## $ p58 : num -1 0.628 -1 -1 -1 -1 -1 -0.808 0.544 -1 ...
## $ p59 : num -1 1 -0.817 -1 -1 -1 -1 -1 -0.454 -1 ...
## $ p60 : num -1 -0.349 0.442 -1 -1 -1 -1 -1 -0.454 -1 ...
## $ p61 : num -1 -1 1 -1 -1 -1 -1 -1 0.663 -1 ...
## $ p62 : num -1 -1 0.5 -1 -1 -1 -1 -1 1 -1 ...
## $ p63 : num -1 -1 -1 -1 -1 -1 -1 -1 0.009 -1 ...
## $ p64 : num -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
## $ p65 : num -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
## $ p66 : num -1 -1 -1 -1 -1 -1 -1 -1 -0.642 -1 ...
## $ p67 : num -1 -1 0.215 -1 -1 -1 -1 -1 1 -1 ...
## $ p68 : num -1 -1 1 -0.236 -1 -1 -1 -1 1 -1 ...
## $ p69 : num -1 -1 0.035 0.965 -0.855 -1 -1 -1 -0.068 -1 ...
## $ p70 : num -0.998 0.551 -1 1 0.839 -0.814 -1 -0.998 -1 -0.852 ...
## $ p71 : num 0.489 0.703 -1 0.109 1 0.798 -0.636 0.581 -1 0.86 ...
## $ p72 : num 0.054 -0.987 -1 -1 0.096 0.213 0.991 0.99 -1 0.152 ...
## $ p73 : num -1 -1 -1 -1 -1 -1 -0.45 -0.402 -0.552 -1 ...
## $ p74 : num -1 0.274 -0.709 -1 -1 -1 -1 -1 -0.918 -1 ...
## $ p75 : num -1 1 0.799 -1 -1 -1 -1 -1 -1 -1 ...
## $ p76 : num -1 -0.62 1 -1 -1 -1 -1 -1 -1 -1 ...
## $ p77 : num -1 -1 0.834 -1 -1 -1 -1 -1 -0.46 -1 ...
## $ p78 : num -1 -1 -0.674 -1 -1 -1 -1 -1 1 -1 ...
## $ p79 : num -1 -1 -1 -1 -1 -1 -1 -1 0.824 -1 ...
## $ p80 : num -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
## $ p81 : num -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
## $ p82 : num -1 -1 -1 -1 -1 -1 -1 -1 -0.965 -1 ...
## $ p83 : num -1 -1 -0.438 -0.747 -1 -1 -1 -1 0.633 -1 ...
## $ p84 : num -1 -1 0.993 0.828 -1 -1 -1 -1 1 -1 ...
## $ p85 : num -1 -1 0.718 1 0.113 -1 -1 -1 0.949 -0.967 ...
## $ p86 : num -0.418 -0.735 -0.523 0.259 1 0.05 -1 -0.209 -0.307 0.365 ...
## $ p87 : num 0.942 0.98 -1 -0.984 0.829 0.894 0.139 0.998 -0.963 0.635 ...
## $ p88 : num -0.781 -0.421 -1 -1 -0.863 -0.747 0.761 0.364 -1 -0.935 ...
## $ p89 : num -1 -0.532 -0.852 -1 -1 -1 -0.988 -1 -1 -1 ...
## $ p90 : num -1 0.88 0.782 -1 -1 -1 -1 -1 -1 -1 ...
## $ p91 : num -1 0.82 1 -1 -1 -1 -1 -1 -1 -1 ...
## $ p92 : num -1 -0.96 0.852 -1 -1 -1 -1 -1 -0.707 -1 ...
## $ p93 : num -1 -1 -0.657 -1 -1 -1 -1 -1 0.408 -1 ...
## $ p94 : num -1 -1 -1 -1 -1 -1 -1 -1 1 -1 ...
## $ p95 : num -1 -1 -1 -1 -1 -1 -1 -1 0.72 -1 ...
## $ p96 : num -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
## $ p97 : num -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
## $ p98 : num -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
## [list output truncated]
set.seed(769)  # reproducibility (the original called set.seed(769) twice in a row)
# Convert the response to a factor so randomForest() performs
# classification rather than regression
data$digit <- as.factor(data$digit)
rf <- randomForest(digit ~ ., data, importance = TRUE)
rf
##
## Call:
## randomForest(formula = digit ~ ., data = data, importance = TRUE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 16
##
## OOB estimate of error rate: 1.04%
## Confusion matrix:
## 6 8 class.error
## 6 822 12 0.014388489
## 8 4 704 0.005649718
# OOB error as a function of the number of trees
plot(rf)
set.seed(769)
# Permutation importance (type = 1: mean decrease in accuracy)
import <- importance(rf, type = 1)
# Rank predictors from most to least important; drop = FALSE keeps the
# one-column matrix from collapsing to a vector
ord <- order(import[, "MeanDecreaseAccuracy"], decreasing = TRUE)
import_sorted <- import[ord, , drop = FALSE]
(top_2 <- head(import_sorted, 2))
## MeanDecreaseAccuracy
## p75 12.55577
## p91 11.58263
# Refit the forest using only the two most important pixels
ops_rf <- randomForest(digit ~ p75 + p91, data)
ops_rf
##
## Call:
## randomForest(formula = digit ~ p75 + p91, data = data)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 1
##
## OOB estimate of error rate: 7.2%
## Confusion matrix:
## 6 8 class.error
## 6 774 60 0.07194245
## 8 51 657 0.07203390
The OOB error rate is 1.04% when all 256 pixel predictors are used; refitting with only the two selected predictors (p75 and p91) increases the OOB error rate to 7.2%.
set.seed(769)
# K-means for k = 2..7, storing the adjusted Rand index against the true
# digit labels in ari[k - 1].  Six values are produced, so preallocate
# length 6 (the original double(5) was silently grown by R on the last
# iteration).
ari <- double(6)
par(mfrow = c(3, 2))  # one panel per k
for (i in 2:7) {
  # NOTE(review): subdata is created earlier in the lab (not visible here);
  # it appears to hold selected pixel columns plus digit -- confirm upstream.
  cluster <- kmeans(subdata, centers = i)
  with(subdata, plot(p75, p37, col = cluster$cluster + 2, main = paste0("k=", i)))
  # Mark the fitted cluster centres
  points(cluster$centers, bg = 3:4, pch = 21, cex = 1.8, lwd = 2)
  ari[i - 1] <- adjustedRandIndex(cluster$cluster, subdata$digit)
}
ari
## [1] 0.9974059 0.8089484 0.7603472 0.6989755 0.6748412 0.6573466
In particular, when k = 2 the ARI reaches a relatively high value of about 0.997, which indicates that the unsupervised method recovers the two digit classes very well. As k increases, the data are split into more clusters than there are true classes, so the ARI steadily decreases.
# Calculate the distance matrix for hierarchical clustering
d <- dist(subdata)
# Six ARI values per method (k = 2..7), so preallocate length 6
# (the original double(5) was silently grown by R)
arihc <- double(6)
method <- c("complete", "single", "average", "centroid")
par(mfrow = c(2, 3))  # 6 panels per page; each method fills one page of k values
for (m in method) {
  # Perform hierarchical clustering with the specified linkage
  hc <- hclust(d, method = m)
  for (k in 2:7) {
    # Cluster labels when the tree is cut into k groups
    trees <- cutree(hc, k)
    arihc[k - 1] <- adjustedRandIndex(subdata$digit, trees)
    # Colour by the hierarchical clusters themselves.  The original code
    # reused the stale k-means `cluster` object here (and its centres), so
    # every panel actually displayed the k-means partition instead of the
    # hclust one; hclust has no centres to plot.
    with(subdata, plot(p75, p37, col = trees + 2,
                       main = paste0("Method = ", m, " k=", k)))
  }
  cat("Method: ", m, "ARI = ", arihc, "\n")
}
## Method: complete ARI = 1 0.820832 0.7816271 0.703652 0.6996163 0.6819863
## Method: single ARI = 1 0.9985977 0.9958 0.9944029 0.9930074 0.9916137
## Method: average ARI = 1 0.8286265 0.7300534 0.6908914 0.6739876 0.6739407
## Method: centroid ARI = 1 0.9958034 0.9618914 0.961851 0.7855862 0.6887279
# Jitter every numeric column slightly (factor = 0.1) so overplotted points
# at identical pixel values become distinguishable; non-numeric columns
# pass through untouched.
jitter_numeric <- function(col) {
  if (is.numeric(col)) jitter(col, factor = 0.1) else col
}
jittered_data <- as.data.frame(lapply(subdata, jitter_numeric))
# True class labels on the jittered coordinates.
# NOTE(review): colours come from data$digit, which assumes subdata keeps
# the same row order as data -- confirm upstream.
digit_factor <- as.factor(data$digit)
plot(jittered_data$p75, jittered_data$p37,
     col = digit_factor, pch = as.numeric(digit_factor),
     main = "True Class Labels", xlab = "P75", ylab = "P37")
#### K-means
kmeans_result <- kmeans(subdata, centers = 2)
# y-axis must be p37: the original referenced jittered_data$P37 (capital P),
# which is NULL because R is case-sensitive, so plot() received no y values.
plot(jittered_data$p75, jittered_data$p37,
     col = kmeans_result$cluster, pch = kmeans_result$cluster + 1,
     main = "K-means Clustering", xlab = "P75", ylab = "P37")
# Helper: scatter the jittered pixel coordinates coloured by a 2-group
# cluster assignment (same plot shape for every linkage method).
plot_partition <- function(cl, title) {
  plot(jittered_data$p75, jittered_data$p37, col = cl, pch = cl + 1,
       main = title, xlab = "P75", ylab = "P37")
}
# Complete linkage, cut into two groups
hc_complete <- hclust(d, method = "complete")
clusters_complete <- cutree(hc_complete, k = 2)
plot_partition(clusters_complete, "Complete Linkage")
##### single linkage
hc_single <- hclust(d, method = "single")
clusters_single <- cutree(hc_single, k = 2)
plot_partition(clusters_single, "Single Linkage")
# Average linkage
hc_average <- hclust(d, method = "average")
clusters_average <- cutree(hc_average, k = 2)
plot_partition(clusters_average, "Average Linkage")
# Centroid linkage
hc_centroid <- hclust(d, method = "centroid")
clusters_centroid <- cutree(hc_centroid, k = 2)
plot_partition(clusters_centroid, "Centroid Linkage")
Based on the resulting plots, these ARI values make sense: at k = 2 every clustering method produces a very similar two-group partition that closely matches the true digit labels, which explains the uniformly high ARI scores.
# Hold out everything after the first 500 rows for testing.
# NOTE(review): this is an unshuffled head/tail split -- fine only if
# subdata's row order is already arbitrary; confirm upstream.
n_train <- 500
train_data <- subdata[seq_len(n_train), ]
test_data <- subdata[(n_train + 1):nrow(subdata), ]
Produce a classification plot for each value of cost, which also shows the decision boundary. You can either use the plot function provided in the e1071 package or write your own code (perhaps similar to mine). Also, add some jittering to the data (with sd=0.1, say), and also show the support vectors, at the same locations as the jittered observations that they are associated with.
# Linear-kernel SVM over a grid of cost values: plot each fitted model,
# then report the test-set confusion matrix and accuracy.
for (i in c(0.001, 0.01, 0.1, 1, 10, 100)) {
  r <- svm(digit ~ ., data = train_data, scale = FALSE,  # FALSE, not F (T/F are reassignable)
           kernel = "linear", cost = i)
  plot(r, train_data)
  mtext(paste0("cost = ", i))  # original label read "cost0.001" with no separator
  yhat <- predict(r, newdata = test_data)
  # Confusion matrix: actual digit vs SVM prediction
  # (the original comment said "Naive Bayes" -- this is the SVM)
  print(table(test_data$digit, yhat))
  # Proportion of correct predictions on the held-out rows
  A <- mean(test_data$digit == yhat)
  cat(paste0("cost = ", i, " Accuracy rate ", A, "\n"))
}
## yhat
## 6 8
## 6 562 0
## 8 390 90
## cost = 0.001 Accuracy rate 0.625719769673704
## yhat
## 6 8
## 6 544 18
## 8 64 416
## cost = 0.01 Accuracy rate 0.921305182341651
## yhat
## 6 8
## 6 532 30
## 8 45 435
## cost = 0.1 Accuracy rate 0.928023032629559
## yhat
## 6 8
## 6 526 36
## 8 37 443
## cost = 1 Accuracy rate 0.929942418426104
## yhat
## 6 8
## 6 526 36
## 8 36 444
## cost = 10 Accuracy rate 0.930902111324376
## yhat
## 6 8
## 6 526 36
## 8 36 444
## cost = 100 Accuracy rate 0.930902111324376
From the plots above, the accuracy rate generally increases with the cost value.
The output shows a large gap in accuracy between cost = 0.001 and the remaining values, indicating that from cost = 0.01 onwards the SVM correctly classifies over 90% of the test observations.
# Radial-kernel SVM at fixed cost = 1, varying gamma: plot each fit and
# report the test-set confusion matrix and accuracy.
for (i in c(0.001, 0.01, 0.1, 1, 10, 100)) {
  r <- svm(digit ~ ., data = train_data, scale = FALSE,  # FALSE, not F (T/F are reassignable)
           kernel = "radial", cost = 1, gamma = i)
  plot(r, train_data)
  mtext(paste0("gamma = ", i))  # original label read "gamma0.001" with no separator
  yhat <- predict(r, newdata = test_data)
  # Confusion matrix: actual digit vs SVM prediction
  # (the original comment said "Naive Bayes" -- this is the SVM)
  print(table(test_data$digit, yhat))
  # Proportion of correct predictions on the held-out rows
  A <- mean(test_data$digit == yhat)
  cat(paste0("gamma = ", i, " Accuracy rate ", A, "\n"))
}
## yhat
## 6 8
## 6 559 3
## 8 158 322
## gamma = 0.001 Accuracy rate 0.845489443378119
## yhat
## 6 8
## 6 537 25
## 8 53 427
## gamma = 0.01 Accuracy rate 0.925143953934741
## yhat
## 6 8
## 6 529 33
## 8 38 442
## gamma = 0.1 Accuracy rate 0.931861804222649
## yhat
## 6 8
## 6 529 33
## 8 36 444
## gamma = 1 Accuracy rate 0.933781190019194
## yhat
## 6 8
## 6 529 33
## 8 35 445
## gamma = 10 Accuracy rate 0.934740882917466
## yhat
## 6 8
## 6 525 37
## 8 32 448
## gamma = 100 Accuracy rate 0.933781190019194
Varying gamma improves performance more than varying the cost did.
Yes, the radial kernel helps: accuracy improves substantially over the smallest gamma, rises as gamma increases to a peak at gamma = 10, and then drops slightly at gamma = 100.
# Tune a radial SVM over all cost/gamma pairs on the full 256-pixel data.
train_256 <- zip[seq_len(500), ]
test_256 <- zip[501:nrow(zip), ]
# digit must be a factor, otherwise svm()/tune() silently perform
# regression and the grid-search "error" is an MSE-like quantity rather
# than a misclassification rate (the original run reported a best
# performance of 1.615847, which is regression error).
train_256$digit <- factor(train_256$digit)
test_256$digit <- factor(test_256$digit, levels = levels(train_256$digit))
values <- c(0.001, 0.01, 0.1, 1, 10, 100)
# 10-fold cross-validation: the original passed validation.x = 10, which is
# a fixed validation-set argument, not how CV folds are requested -- use
# tunecontrol instead (cross = 10 is also tune()'s default).
rt <- tune(svm, digit ~ ., data = train_256, kernel = "radial", scale = FALSE,
           ranges = list(cost = values, gamma = values),
           tunecontrol = tune.control(sampling = "cross", cross = 10))
summary(rt)
model <- rt$best.model
# Make predictions on the training set
yhat_train <- predict(model, newdata = train_256)
# Accuracy = proportion correct.  The original computed 1 - mean(==), i.e.
# the error rate, while labelling the result "Accuracy".
train_accuracy <- mean(train_256$digit == yhat_train)
cat("Training Set Accuracy: ", sprintf("%.10f", train_accuracy), "\n")
# Make predictions on the test set
yhat_test <- predict(rt$best.model, newdata = test_256)
test_accuracy <- mean(yhat_test == test_256$digit)
cat("Test Set Accuracy: ", sprintf("%.10f", test_accuracy), "\n")
Parameter tuning of ‘svm’:
- sampling method: 10-fold cross validation
- best parameters:
cost gamma
10 0.01
- best performance: 1.615847
- Detailed performance results:
cost gamma error dispersion
1e-03 1e-03 8.720153 0.8214717
1e-02 1e-03 8.576763 0.8004872
1e-01 1e-03 7.482756 0.6376604
1e+00 1e-03 3.950381 0.7507093
1e+01 1e-03 2.594183 0.8554863
1e+02 1e-03 2.120036 0.6751115
1e-03 1e-02 8.685074 0.8179124
1e-02 1e-02 8.242179 0.7692930
1e-01 1e-02 5.556555 0.6460357
1e+00 1e-02 2.088004 0.7254578
1e+01 1e-02 1.615847 0.6433051
1e+02 1e-02 1.621082 0.6415757
1e-03 1e-01 8.728804 0.8223642
1e-02 1e-01 8.662643 0.8097531
1e-01 1e-01 8.143349 0.7247699
1e+00 1e-01 7.380374 0.8001499
1e+01 1e-01 7.009282 0.8914761
1e+02 1e-01 7.009282 0.8914761
1e-03 1e+00 8.736279 0.8236107
1e-02 1e+00 8.735986 0.8217589
1e-01 1e+00 8.734875 0.8037311
1e+00 1e+00 8.744292 0.7314770
1e+01 1e+00 8.576852 0.7594888
1e+02 1e+00 8.576852 0.7594888
1e-03 1e+01 8.736402 0.8236676
1e-02 1e+01 8.737213 0.8223241
1e-01 1e+01 8.746837 0.8090230
1e+00 1e+01 8.843988 0.7622132
1e+01 1e+01 8.765956 0.8204965
1e+02 1e+01 8.765956 0.8204965
1e-03 1e+02 8.736402 0.8236676
1e-02 1e+02 8.737213 0.8223241
1e-01 1e+02 8.746837 0.8090231
1e+00 1e+02 8.843989 0.7622142
1e+01 1e+02 8.765958 0.8204991
1e+02 1e+02 8.765958 0.8204991
Training Set Accuracy: 1.0000000000
Test Set Accuracy: 1.0000000000
Random Forest identified the top two predictors (p75 and p91) for differentiating digits 6 and 8, achieving an OOB error rate of 1.04%.
A second forest fitted with only these two predictors yielded an OOB error rate of 7.2%.
K-means Clustering: K-means was applied to partition the data into 2 to 7 clusters. The adjusted Rand index (ARI) for K=2 was about 0.997, indicating effective clustering.
Hierarchical Clustering: Clustering was repeated with different linkage methods (complete, single, average, centroid). ARI values confirmed similar performance to K-means.
Scatter plots were produced for each partitioning method to visualize clustering results.
Linear Kernel: SVMs were trained with different cost values (0.001, 0.01, 0.1, 1, 10, 100). Higher cost values improved accuracy, but increased complexity.
Radial Kernel: SVMs with a radial kernel were trained with varying gamma values, showing improved accuracy compared to the linear kernel. Gamma = 10 provided the best performance.
Training and test errors were evaluated for each model, highlighting the effect of cost and gamma on decision boundaries and support vectors.